package com.yzq.os.spider.v;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Predicate;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;

import com.yzq.os.spider.v.service.http.HttpClientService;
import com.yzq.os.spider.v.util.Encode;

/**
 * Before a new web site is added, replays a request captured from the Chrome
 * browser's request log so that the site's configuration can be verified.
 * 
 * @author 苑志强(xingyu_yzq@163.com)
 * 
 */
public class WebSiteCrawlTest {

	/**
	 * Names of the general ("base") entries of a Chrome request log. They
	 * describe the request itself and are not sent as request parameters.
	 */
	private static final String[] REQUEST_BASE = { "Remote Address",
			"Request URL", "Request Method", "Status Code" };

	/**
	 * Class-path location of the request log used when no command-line
	 * argument is supplied.
	 */
	private static final String DEFAULT_FILE_PATH = "/example/www.liepin.com.txt";

	/**
	 * Character encodings tried by {@link #testUrlEncode(String)} and
	 * {@link #testUrlDecode(String)}.
	 */
	private static final String[] CANDIDATE_ENCODINGS = { "ISO-8859-1", "GBK",
			"GB2312", "UTF-8", "GB18030" };

	/**
	 * Replays the request log and then prints a sample string URL-decoded and
	 * URL-encoded with every candidate encoding.
	 * 
	 * @param args
	 *            optional; {@code args[0]} is the class-path location of the
	 *            request log, defaulting to {@link #DEFAULT_FILE_PATH}
	 * @throws Exception
	 *             if the log cannot be read or the request fails
	 */
	public static void main(String[] args) throws Exception {
		String filePath = DEFAULT_FILE_PATH;
		if (args != null && args.length > 0 && StringUtils.isNotBlank(args[0])) {
			filePath = args[0];
		}
		WebSiteCrawlTest crawlTest = new WebSiteCrawlTest();
		crawlTest.doTest(filePath);
		crawlTest.testUrlDecode("%E4%B8%AD%E6%96%87");
		crawlTest.testUrlEncode("中文");
	}

	/**
	 * Simulates one request from the data of a Chrome request log and prints
	 * the parsed request description and the response body to standard output.
	 * 
	 * @param filePath
	 *            class-path location of the request log file
	 * @throws Exception
	 *             if the file cannot be read or the request fails
	 */
	public void doTest(String filePath) throws Exception {
		Resource res = new ClassPathResource(filePath);
		// NOTE(review): readLines(File) decodes with the platform default
		// charset; confirm the log files' encoding before relying on non-ASCII
		// parameter values.
		List<String> lines = FileUtils.readLines(res.getFile());
		lines = filterNotes(lines);// drop comment lines
		List<NameValuePair> requestNameValueList = conversionDataLines(lines);
		String fullURL = getValueForName(requestNameValueList, "Request URL");
		// Any query string on the logged URL is discarded; the query is rebuilt
		// from the parameter entries of the log.
		fullURL = StringUtils.substringBefore(fullURL, "?");
		String method = getValueForName(requestNameValueList, "Request Method");
		String encode = "UTF-8";
		System.out.println("-------------base--------------------");
		System.out.println("Base url:[" + fullURL + "]");
		System.out.println("Method:[" + method + "]");
		System.out.println("Url encode:[" + encode + "]");
		List<NameValuePair> parameters = filterRequestBases(requestNameValueList);
		System.out.println("-------------parameters-------------");
		for (NameValuePair pair : parameters) {
			System.out.println("pName[" + pair.getName() + "],pValue["
					+ pair.getValue() + "]");
		}
		System.out.println("--------------do request------------");
		HttpClientService service = new HttpClientService();
		String html = null;
		if (StringUtils.equalsIgnoreCase(method, "POST")) {
			List<Header> headers = new ArrayList<Header>();
			headers.add(new BasicHeader("Referer", "http://www.baidu.com"));
			html = service.doPostRequest(fullURL, Encode.GB18030, true,
					headers, parameters, encode);
		} else if (StringUtils.equalsIgnoreCase(method, "GET")) {
			String url = makeGetUrl(fullURL, parameters, encode);
			System.out.println("Get url:[" + url + "]");
			html = service.doGetRequest(url, true);
		} else {
			// Make it visible why no response follows instead of silently
			// printing a null body.
			System.out.println("Unsupported request method:[" + method
					+ "], no request sent");
		}
		System.out.println("-------------response--------------");
		System.out.println();
		System.out.println("html[" + html + "]");
		System.out.println("-------------end-------------------");
	}

	/**
	 * Returns the request parameters, i.e. every pair whose name is not one of
	 * the {@link #REQUEST_BASE} names. The given list is left untouched; a new
	 * list is returned.
	 * 
	 * @param lines
	 *            all name/value pairs parsed from the log, may be {@code null}
	 * @return a new list without the base entries, never {@code null}
	 */
	private List<NameValuePair> filterRequestBases(List<NameValuePair> lines) {
		List<NameValuePair> parameters = new ArrayList<NameValuePair>();
		if (CollectionUtils.isNotEmpty(lines)) {
			// Filter a copy: CollectionUtils.filter removes elements in place
			// and the caller's list must keep its base entries.
			parameters.addAll(lines);
			CollectionUtils.filter(parameters, new Predicate() {

				@Override
				public boolean evaluate(Object obj) {
					return !isRequestBase(((NameValuePair) obj).getName());
				}
			});
		}
		return parameters;
	}

	/**
	 * Tells whether the name is one of the {@link #REQUEST_BASE} names,
	 * ignoring case.
	 * 
	 * @param name
	 *            entry name, may be {@code null}
	 * @return {@code true} if the name denotes a base entry
	 */
	private static boolean isRequestBase(String name) {
		for (String baseName : REQUEST_BASE) {
			if (StringUtils.equalsIgnoreCase(baseName, name)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Removes the comment lines (lines starting with {@code //}) from the
	 * request log lines.
	 * 
	 * @param lines
	 *            raw lines of the log file, may be {@code null}
	 * @return a new list containing the non-comment lines in their original
	 *         order
	 */
	private List<String> filterNotes(List<String> lines) {
		List<String> returnList = new ArrayList<String>();
		if (CollectionUtils.isNotEmpty(lines)) {
			for (String line : lines) {
				if (!StringUtils.startsWith(line, "//")) {
					returnList.add(line);
				}
			}
		}
		return returnList;
	}

	/**
	 * Converts the log lines into name/value pairs, splitting each line at its
	 * first {@code :}. Blank lines and lines with nothing before the first
	 * {@code :} are skipped, because a pair cannot be built without a name.
	 * 
	 * @param lines
	 *            log lines without comments, may be {@code null}
	 * @return the parsed pairs in line order, never {@code null}
	 */
	private List<NameValuePair> conversionDataLines(List<String> lines) {
		List<NameValuePair> pairs = new ArrayList<NameValuePair>();
		if (CollectionUtils.isNotEmpty(lines)) {
			for (String line : lines) {
				String name = getName(line);
				if (StringUtils.isBlank(name)) {
					continue;
				}
				pairs.add(new BasicNameValuePair(name, getValue(line)));
			}
		}
		return pairs;
	}

	/**
	 * Returns the part of the line before its first {@code :}, trimmed, to be
	 * used as the name. A line without {@code :} is a name on its own.
	 * 
	 * @param line
	 *            one log line
	 * @return the trimmed name (possibly empty), or {@code null} for a blank
	 *         line
	 */
	private String getName(String line) {
		if (StringUtils.isBlank(line)) {
			return null;
		}
		return StringUtils.trim(StringUtils.substringBefore(line, ":"));
	}

	/**
	 * Returns the part of the line after its first {@code :}, trimmed, to be
	 * used as the value. Further {@code :} characters stay part of the value.
	 * 
	 * @param line
	 *            one log line
	 * @return the trimmed value, an empty string if the line has no
	 *         {@code :} or nothing after it, or {@code null} for a blank line
	 */
	private String getValue(String line) {
		if (StringUtils.isBlank(line)) {
			return null;
		}
		return StringUtils.trim(StringUtils.substringAfter(line, ":"));
	}

	/**
	 * Looks up the value of the first pair with the given name, ignoring case.
	 * 
	 * @param pairs
	 *            pairs to search, may be {@code null}
	 * @param name
	 *            name to look for
	 * @return the value of the first matching pair, or an empty string if
	 *         there is none
	 */
	private String getValueForName(List<NameValuePair> pairs, String name) {
		String value = "";
		if (CollectionUtils.isNotEmpty(pairs) && StringUtils.isNotBlank(name)) {
			for (NameValuePair pair : pairs) {
				if (StringUtils.equalsIgnoreCase(name, pair.getName())) {
					value = pair.getValue();
					break;
				}
			}
		}
		return value;
	}

	/**
	 * Builds the URL of a GET request by appending the encoded parameters to
	 * the base URL.
	 * 
	 * @param baseUrl
	 *            URL to append the query to
	 * @param params
	 *            request parameters
	 * @param encode
	 *            character encoding used to URL-encode the parameters
	 * @return the base URL followed by {@code ?} (or {@code &} if it already
	 *         contains a {@code ?}) and the query string; the unchanged base
	 *         URL if there is nothing to append
	 */
	private String makeGetUrl(String baseUrl, List<NameValuePair> params,
			String encode) {
		String queryStr = URLEncodedUtils.format(params, encode);
		if (StringUtils.isEmpty(queryStr)) {
			// Avoid a dangling "?" or "&" when there are no parameters.
			return baseUrl;
		}
		String separator = StringUtils.contains(baseUrl, "?") ? "&" : "?";
		return baseUrl + separator + queryStr;
	}

	/**
	 * Prints the input URL-encoded with each candidate encoding, which helps
	 * to work out which encoding a site uses.
	 * 
	 * @param input
	 *            plain text to encode
	 * @throws UnsupportedEncodingException
	 *             if a candidate encoding is not supported by the runtime
	 */
	private void testUrlEncode(String input)
			throws UnsupportedEncodingException {
		for (String encode : CANDIDATE_ENCODINGS) {
			String output = URLEncoder.encode(input, encode);
			System.out.println("input[" + input + "],encode:[" + encode
					+ "],output:[" + output + "]");
		}
	}

	/**
	 * Prints the input URL-decoded with each candidate encoding, which helps
	 * to work out which encoding a site uses.
	 * 
	 * @param input
	 *            URL-encoded text to decode
	 * @throws UnsupportedEncodingException
	 *             if a candidate encoding is not supported by the runtime
	 */
	private void testUrlDecode(String input)
			throws UnsupportedEncodingException {
		for (String encode : CANDIDATE_ENCODINGS) {
			String output = URLDecoder.decode(input, encode);
			System.out.println("input[" + input + "],encode:[" + encode
					+ "],output:[" + output + "]");
		}
	}

}
